home *** CD-ROM | disk | FTP | other *** search
Text File | 1992-01-03 | 50.8 KB | 1,891 lines |
- Newsgroups: comp.sources.misc
- From: ylo@ngs.fi (Tatu Ylonen)
- Subject: v27i023: regexpr - regexp library compatible with gnu regex, Part01/01
- Message-ID: <1992Jan3.031320.2360@sparky.imd.sterling.com>
- X-Md4-Signature: 81e4c7ebca59ef67b5925ed8e7ecb2a4
- Date: Fri, 3 Jan 1992 03:13:20 GMT
- Approved: kent@sparky.imd.sterling.com
-
- Submitted-by: ylo@ngs.fi (Tatu Ylonen)
- Posting-number: Volume 27, Issue 23
- Archive-name: regexpr/part01
- Environment: UNIX, MSDOS
-
- Regexpr is a regular expression package. It is free (meaning that you
- may do anything you want with it); the original motivation for writing
- it was not being able to use the GNU library in a commercial
- application.
-
- Some of the features include:
- - fully compatible with gnu regex library (I ran emacs with this
- library for several weeks as a test)
- - can handle arbitrary data, including binary characters
- - can handle split data
- - compiles and runs also on 16 bit machines (eg. MSDOS)
- - does not use alloca
- - fairly easy to extend and modify (easier than the gnu version anyway)
- - speed comparable to that of the GNU library (searches seem a bit
- faster, matches about the same and compiling a bit slower than in
- the gnu library)
- - there are some extensions (enabled if RE_ANSI_HEX is set in syntax):
- \vnn for accessing registers > 9 (useful if RE_NREGS > 10)
- \xhh specifies character in hex
- \a ascii 7
- \b ascii 8
- \f ascii 12
- \n ascii 10
- \r ascii 13
- \t ascii 9
- \v ascii 11
-
- I have not written any documentation; see the header file and the
- documentation of the GNU Regex library in the GNU Emacs distribution.
-
- Send comments, bug fixes and suggestions to Tatu Ylonen
- <ylo@cs.hut.fi>.
-
- #! /bin/sh
- # This is a shell archive. Remove anything before this line, then unpack
- # it by saving it into a file and typing "sh file". To overwrite existing
- # files, type "sh file -c". You can also feed this as standard input via
- # unshar, or by typing "sh <file", e.g.. If this archive is complete, you
- # will see the following message at the end:
- # "End of shell archive."
- # Contents: regexpr.h regexpr.c
- # Wrapped by ylo@ngs.fi on Mon Dec 30 09:18:58 1991
- PATH=/bin:/usr/bin:/usr/ucb ; export PATH
- if test -f regexpr.h -a "${1}" != "-c" ; then
- echo shar: Will not over-write existing file \"regexpr.h\"
- else
- echo shar: Extracting \"regexpr.h\" \(5275 characters\)
- sed "s/^X//" >regexpr.h <<'END_OF_regexpr.h'
- X/*
- X
- Xregexpr.h
- X
- XAuthor: Tatu Ylonen <ylo@ngs.fi>
- X
- XCopyright (c) 1991 Tatu Ylonen, Espoo, Finland
- X
- XPermission to use, copy, modify, distribute, and sell this software
- Xand its documentation for any purpose is hereby granted without fee,
- Xprovided that the above copyright notice appear in all copies. This
- Xsoftware is provided "as is" without express or implied warranty.
- X
- XCreated: Thu Sep 26 17:15:36 1991 ylo
- XLast modified: Mon Nov 4 15:49:46 1991 ylo
- X
- X*/
- X
- X#ifndef REGEXPR_H
- X#define REGEXPR_H
- X
- X#define RE_NREGS 10 /* number of registers available */
- X
- Xtypedef struct re_pattern_buffer /* Compiled pattern together with the
- X caller-supplied tables used with it; regexp_t is a pointer to this struct. */
- X{
- X char *buffer; /* compiled pattern; malloc'ed memory or NULL before compiling */
- X int allocated; /* allocated size of compiled pattern (0 if buffer is NULL) */
- X int used; /* actual length of compiled pattern */
- X char *fastmap; /* fastmap[ch] is true if ch can start pattern; 256 chars or NULL */
- X char *translate; /* translation to apply during compilation/matching, or NULL */
- X char fastmap_accurate; /* true if fastmap is valid */
- X char can_be_null; /* nonzero if can match empty string (2: only at end of buffer) */
- X char uses_registers; /* registers are used and need to be initialized */
- X char anchor; /* anchor: 0=none 1=begline 2=begbuf */
- X} *regexp_t;
- X
- Xtypedef struct re_registers /* Register offsets handed to the match and search
- X functions through their regs argument; regexp_registers_t points to one. */
- X{
- X int start[RE_NREGS]; /* start offset of region */
- X int end[RE_NREGS]; /* end offset of region */
- X} *regexp_registers_t;
- X
- X/* bit definitions for syntax */
- X#define RE_NO_BK_PARENS 1 /* no quoting for parentheses */
- X#define RE_NO_BK_VBAR 2 /* no quoting for vertical bar */
- X#define RE_BK_PLUS_QM 4 /* quoting needed for + and ? */
- X#define RE_TIGHT_VBAR 8 /* | binds tighter than ^ and $ */
- X#define RE_NEWLINE_OR 16 /* treat newline as or */
- X#define RE_CONTEXT_INDEP_OPS 32 /* ^$?*+ are special in all contexts */
- X#define RE_ANSI_HEX 64 /* ansi sequences (\n etc) and \xhh */
- X#define RE_NO_GNU_EXTENSIONS 128 /* no gnu extensions */
- X
- X/* definitions for some common regexp styles */
- X#define RE_SYNTAX_AWK (RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS)
- X#define RE_SYNTAX_EGREP (RE_SYNTAX_AWK|RE_NEWLINE_OR)
- X#define RE_SYNTAX_GREP (RE_BK_PLUS_QM|RE_NEWLINE_OR)
- X#define RE_SYNTAX_EMACS 0
- X
- X#ifdef __STDC__
- X
- Xint re_set_syntax(int syntax);
- X/* This sets the syntax to use and returns the previous syntax. The
- X syntax is specified by a bit mask of the above defined bits. */
- X
- Xchar *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
- X/* This compiles the regexp (given in regex and length in regex_size).
- X This returns NULL if the regexp compiled successfully, and an error
- X message if an error was encountered. The buffer field must be
- X initialized to a memory area allocated by malloc (or to NULL) before
- X use, and the allocated field must be set to its length (or 0 if buffer is
- X NULL). Also, the translate field must be set to point to a valid
- X translation table, or NULL if it is not used. */
- X
- Xint re_match(regexp_t compiled, char *string, int size, int pos,
- X regexp_registers_t regs);
- X/* This tries to match the regexp against the string. This returns the
- X length of the matched portion, or -1 if the pattern could not be
- X matched and -2 if an error (such as failure stack overflow) is
- X encountered. */
- X
- Xint re_match_2(regexp_t compiled, char *string1, int size1,
- X char *string2, int size2, int pos, regexp_registers_t regs,
- X int mstop);
- X/* This tries to match the regexp to the concatenation of string1 and
- X string2. This returns the length of the matched portion, or -1 if the
- X pattern could not be matched and -2 if an error (such as failure stack
- X overflow) is encountered. */
- X
- Xint re_search(regexp_t compiled, char *string, int size, int startpos,
- X int range, regexp_registers_t regs);
- X/* This searches for a substring matching the regexp. This returns the first
- X index at which a match is found. range specifies at how many positions to
- X try matching; positive values indicate searching forwards, and negative
- X values indicate searching backwards. In re_search_2, mstop specifies the
- X offset beyond which a match must not go. This returns -1 if no match is
- X found, and -2 if an error (such as failure stack overflow) is encountered. */
- X
- Xint re_search_2(regexp_t compiled, char *string1, int size1,
- X char *string2, int size2, int startpos, int range,
- X regexp_registers_t regs, int mstop);
- X/* This is like re_search, but search from the concatenation of string1 and
- X string2. */
- X
- Xvoid re_compile_fastmap(regexp_t compiled);
- X/* This computes the fastmap for the regexp. For this to have any effect,
- X the calling program must have initialized the fastmap field to point
- X to an array of 256 characters. */
- X
- Xchar *re_comp(char *s);
- X/* BSD 4.2 regex library routine re_comp. This compiles the regexp into
- X an internal buffer. This returns NULL if the regexp was compiled
- X successfully, and an error message if there was an error. */
- X
- Xint re_exec(char *s);
- X/* BSD 4.2 regexp library routine re_exec. This returns true if the string
- X matches the regular expression (that is, a matching part is found
- X anywhere in the string). */
- X
- X#else /* __STDC__ */
- X
- Xint re_set_syntax();
- Xchar *re_compile_pattern();
- Xint re_match();
- Xint re_match_2();
- Xint re_search();
- Xint re_search_2();
- Xvoid re_compile_fastmap();
- Xchar *re_comp();
- Xint re_exec();
- X
- X#endif /* __STDC__ */
- X
- X#endif /* REGEXPR_H */
- X
- X
- END_OF_regexpr.h
- if test 5275 -ne `wc -c <regexpr.h`; then
- echo shar: \"regexpr.h\" unpacked with wrong size!
- fi
- # end of overwriting check
- fi
- if test -f regexpr.c -a "${1}" != "-c" ; then
- echo shar: Will not over-write existing file \"regexpr.c\"
- else
- echo shar: Extracting \"regexpr.c\" \(41626 characters\)
- sed "s/^X//" >regexpr.c <<'END_OF_regexpr.c'
- X/*
- X
- Xregexpr.c
- X
- XAuthor: Tatu Ylonen <ylo@ngs.fi>
- X
- XCopyright (c) 1991 Tatu Ylonen, Espoo, Finland
- X
- XPermission to use, copy, modify, distribute, and sell this software
- Xand its documentation for any purpose is hereby granted without fee,
- Xprovided that the above copyright notice appear in all copies. This
- Xsoftware is provided "as is" without express or implied warranty.
- X
- XCreated: Thu Sep 26 17:14:05 1991 ylo
- XLast modified: Mon Nov 4 17:06:48 1991 ylo
- X
- XThis code draws many ideas from the regular expression packages by
- XHenry Spencer of the University of Toronto and Richard Stallman of the
- XFree Software Foundation.
- X
- XEmacs-specific code and syntax table code is almost directly borrowed
- Xfrom GNU regexp.
- X
- X$Header: /u/src/lib/tools/RCS/regexpr.c,v 1.1 91/12/30 08:53:37 ylo Exp $
- X
- X*/
- X
- X#include <stdio.h>
- X#include <assert.h>
- X#include "regexpr.h"
- X
- Xchar *malloc();
- Xvoid free();
- Xchar *realloc();
- X
- X#define MACRO_BEGIN do {
- X#define MACRO_END } while (0)
- X
- Xenum regexp_compiled_ops /* opcodes for compiled regexp. Each opcode is stored
- X as one byte of the pattern buffer, followed by the operand bytes noted below. */
- X{
- X Cend, /* end of pattern reached */
- X Cbol, /* beginning of line */
- X Ceol, /* end of line */
- X Cset, /* character set. Followed by 32 bytes of set: character c is in the
- X set when bit (c & 7) of byte c/8 is on. */
- X Cexact, /* followed by a byte to match */
- X Canychar, /* matches any character except newline */
- X Cstart_memory, /* set register start addr (followed by reg number) */
- X Cend_memory, /* set register end addr (followed by reg number) */
- X Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
- X Cjump, /* followed by two bytes (lsb,msb) of displacement. The displacement
- X is signed and counted from the byte after the two operand bytes; the
- X other *_jump opcodes below carry the same operand. */
- X Cstar_jump, /* will change to jump/update_failure_jump at runtime */
- X Cfailure_jump, /* jump to addr on failure */
- X Cupdate_failure_jump, /* update topmost failure point and jump */
- X Cdummy_failure_jump, /* push a dummy failure point and jump */
- X Cbegbuf, /* match at beginning of buffer */
- X Cendbuf, /* match at end of buffer */
- X Cwordbeg, /* match at beginning of word */
- X Cwordend, /* match at end of word */
- X Cwordbound, /* match if at word boundary */
- X Cnotwordbound, /* match if not at word boundary */
- X#ifdef emacs
- X Cemacs_at_dot, /* emacs only: matches at dot */
- X#endif /* emacs */
- X Csyntaxspec, /* matches syntax code (1 byte follows) */
- X Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
- X};
- X
- Xenum regexp_syntax_op /* syntax codes for plain and quoted characters. The
- X tables regexp_plain_ops[] and regexp_quoted_ops[] map each input character
- X to one of these codes, and regexp_precedences[] is indexed by them. */
- X{
- X Rend, /* special code for end of regexp */
- X Rnormal, /* normal character */
- X Ranychar, /* any character except newline */
- X Rquote, /* the quote character */
- X Rbol, /* match beginning of line */
- X Reol, /* match end of line */
- X Roptional, /* match preceding expression optionally */
- X Rstar, /* match preceding expr zero or more times */
- X Rplus, /* match preceding expr one or more times */
- X Ror, /* match either of alternatives */
- X Ropenpar, /* opening parenthesis */
- X Rclosepar, /* closing parenthesis */
- X Rmemory, /* match memory register */
- X Rextended_memory, /* \vnn to match registers 10-99 */
- X Ropenset, /* open set. Internal syntax hard-coded below. */
- X /* the following are gnu extensions to "normal" regexp syntax */
- X Rbegbuf, /* beginning of buffer */
- X Rendbuf, /* end of buffer */
- X Rwordchar, /* word character */
- X Rnotwordchar, /* not word character */
- X Rwordbeg, /* beginning of word */
- X Rwordend, /* end of word */
- X Rwordbound, /* word bound */
- X Rnotwordbound, /* not word bound */
- X#ifdef emacs
- X Remacs_at_dot, /* emacs: at dot */
- X Remacs_syntaxspec, /* syntaxspec */
- X Remacs_notsyntaxspec, /* notsyntaxspec */
- X#endif /* emacs */
- X Rnum_ops /* number of syntax codes; used as the size of regexp_precedences[] */
- X};
- X
- Xstatic int re_compile_initialized = 0;
- Xstatic int regexp_syntax = 0;
- Xstatic unsigned char regexp_plain_ops[256];
- Xstatic unsigned char regexp_quoted_ops[256];
- Xstatic unsigned char regexp_precedences[Rnum_ops];
- Xstatic int regexp_context_indep_ops;
- Xstatic int regexp_ansi_sequences;
- X
- X#define NUM_LEVELS 5 /* number of precedence levels in use */
- X#define MAX_NESTING 100 /* max nesting level of operators */
- X
- X#ifdef emacs
- X
- X/* This code is for emacs compatibility only. */
- X
- X#include "config.h"
- X#include "lisp.h"
- X#include "buffer.h"
- X#include "syntax.h"
- X
- X/* emacs defines NULL in some strange way? */
- X#undef NULL
- X#define NULL 0
- X
- X#else /* emacs */
- X
- X#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
- X#define Sword 1
- X
- X#ifdef SYNTAX_TABLE
- Xchar *re_syntax_table;
- X#else
- Xstatic char re_syntax_table[256];
- X#endif /* SYNTAX_TABLE */
- X
- X#endif /* emacs */
- X
- X/* Rebuild the tables that drive the compiler from the current value of
- X   regexp_syntax: the plain and quoted character-to-operator maps, the
- X   operator precedences, and the two cached syntax flags.  Outside emacs,
- X   and when no external SYNTAX_TABLE is used, the default word-character
- X   table is also filled in the first time this runs.  */
- Xstatic void re_compile_initialize()
- X{
- X  int ch;
- X  int tight_vbar;
- X  unsigned char *paren_ops, *vbar_ops, *plus_qm_ops;
- X
- X#if !defined(emacs) && !defined(SYNTAX_TABLE)
- X  static int syntax_table_inited = 0;
- X
- X  /* Default syntax table: ASCII letters and digits are word characters. */
- X  if (!syntax_table_inited)
- X    {
- X      syntax_table_inited = 1;
- X      memset(re_syntax_table, 0, 256);
- X      for (ch = 0; ch < 256; ch++)
- X        if ((ch >= 'a' && ch <= 'z') ||
- X            (ch >= 'A' && ch <= 'Z') ||
- X            (ch >= '0' && ch <= '9'))
- X          re_syntax_table[ch] = Sword;
- X    }
- X#endif /* !emacs && !SYNTAX_TABLE */
- X  re_compile_initialized = 1;
- X
- X  /* Every character starts out ordinary, plain or quoted. */
- X  for (ch = 0; ch < 256; ch++)
- X    regexp_plain_ops[ch] = regexp_quoted_ops[ch] = Rnormal;
- X  for (ch = '0'; ch <= '9'; ch++)
- X    regexp_quoted_ops[ch] = Rmemory;
- X  regexp_plain_ops['\134'] = Rquote;
- X
- X  /* The syntax bits decide whether these operators are written plain
- X     or with a preceding backslash. */
- X  paren_ops = (regexp_syntax & RE_NO_BK_PARENS) ?
- X    regexp_plain_ops : regexp_quoted_ops;
- X  paren_ops['('] = Ropenpar;
- X  paren_ops[')'] = Rclosepar;
- X
- X  vbar_ops = (regexp_syntax & RE_NO_BK_VBAR) ?
- X    regexp_plain_ops : regexp_quoted_ops;
- X  vbar_ops['\174'] = Ror;
- X
- X  regexp_plain_ops['*'] = Rstar;
- X
- X  plus_qm_ops = (regexp_syntax & RE_BK_PLUS_QM) ?
- X    regexp_quoted_ops : regexp_plain_ops;
- X  plus_qm_ops['+'] = Rplus;
- X  plus_qm_ops['?'] = Roptional;
- X
- X  if (regexp_syntax & RE_NEWLINE_OR)
- X    regexp_plain_ops['\n'] = Ror;
- X  regexp_plain_ops['\133'] = Ropenset;
- X  regexp_plain_ops['\136'] = Rbol;
- X  regexp_plain_ops['$'] = Reol;
- X  regexp_plain_ops['.'] = Ranychar;
- X
- X  if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
- X    {
- X#ifdef emacs
- X      regexp_quoted_ops['='] = Remacs_at_dot;
- X      regexp_quoted_ops['s'] = Remacs_syntaxspec;
- X      regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
- X#endif /* emacs */
- X      regexp_quoted_ops['w'] = Rwordchar;
- X      regexp_quoted_ops['W'] = Rnotwordchar;
- X      regexp_quoted_ops['<'] = Rwordbeg;
- X      regexp_quoted_ops['>'] = Rwordend;
- X      regexp_quoted_ops['b'] = Rwordbound;
- X      regexp_quoted_ops['B'] = Rnotwordbound;
- X      regexp_quoted_ops['`'] = Rbegbuf;
- X      regexp_quoted_ops['\''] = Rendbuf;
- X    }
- X  if (regexp_syntax & RE_ANSI_HEX)
- X    regexp_quoted_ops['v'] = Rextended_memory;
- X
- X  /* Precedences: everything binds tightest by default; | and the
- X     anchors swap places depending on RE_TIGHT_VBAR. */
- X  for (ch = 0; ch < Rnum_ops; ch++)
- X    regexp_precedences[ch] = 4;
- X  tight_vbar = (regexp_syntax & RE_TIGHT_VBAR) != 0;
- X  regexp_precedences[Ror] = tight_vbar ? 3 : 2;
- X  regexp_precedences[Rbol] = tight_vbar ? 2 : 3;
- X  regexp_precedences[Reol] = tight_vbar ? 2 : 3;
- X  regexp_precedences[Rclosepar] = 1;
- X  regexp_precedences[Rend] = 0;
- X
- X  regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) ? 1 : 0;
- X  regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) ? 1 : 0;
- X}
- X
- X/* Select SYNTAX (a mask of the RE_* bits) for later compilations,
- X   rebuild the compiler tables for it, and return the syntax that was
- X   in effect before the call.  */
- Xint re_set_syntax(syntax)
- Xint syntax;
- X{
- X  int previous = regexp_syntax;
- X
- X  regexp_syntax = syntax;
- X  re_compile_initialize();
- X  return previous;
- X}
- X
- X/* Return the value (0..15) of the hexadecimal digit CH, accepting both
- X   letter cases.  Any other character yields 16, which callers use as
- X   the "not a hex digit" marker.  */
- Xstatic int hex_char_to_decimal(ch)
- Xint ch;
- X{
- X  int value = 16;
- X
- X  if ('0' <= ch && ch <= '9')
- X    value = ch - '0';
- X  else if ('a' <= ch && ch <= 'f')
- X    value = 10 + (ch - 'a');
- X  else if ('A' <= ch && ch <= 'F')
- X    value = 10 + (ch - 'A');
- X  return value;
- X}
- X
- X/* Compile the regular expression REGEX (SIZE bytes; it need not be
- X   NUL-terminated) into BUFP, using the syntax selected with
- X   re_set_syntax.
- X
- X   On entry bufp->buffer must be NULL or malloc'ed memory of
- X   bufp->allocated bytes, and bufp->translate must be NULL or a
- X   translation table.  The buffer is grown with realloc as needed.
- X
- X   Returns NULL on success, otherwise a pointer to a constant error
- X   message.  On every exit bufp->buffer, bufp->allocated and bufp->used
- X   are updated to describe the (possibly partial) compiled code; after
- X   an out-of-memory error they still describe the last valid buffer, so
- X   the caller keeps ownership of it.  bufp->fastmap_accurate is reset
- X   and bufp->uses_registers is set when the pattern uses registers.  */
- Xchar *re_compile_pattern(regex, size, bufp)
- Xchar *regex;
- Xint size;
- Xregexp_t bufp;
- X{
- X  int a, pos, op, current_level, level, opcode;
- X  int pattern_offset, alloc;
- X  int starts[NUM_LEVELS * MAX_NESTING], starts_base;
- X  int future_jumps[MAX_NESTING], num_jumps;
- X  unsigned char ch;
- X  char *pattern, *translate;
- X  int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
- X  int beginning_context;
- X
- X/* Fetch the next pattern character into VAR, or fail if none is left. */
- X#define NEXTCHAR(var) \
- X  MACRO_BEGIN \
- X    if (pos >= size) \
- X      goto ends_prematurely; \
- X    (var) = regex[pos]; \
- X    pos++; \
- X  MACRO_END
- X
- X/* Make sure AMOUNT more bytes fit in the pattern buffer.  The result of
- X   realloc goes through a temporary so that on failure the old buffer
- X   and its size stay valid and are handed back to the caller. */
- X#define ALLOC(amount) \
- X  MACRO_BEGIN \
- X    if (pattern_offset+(amount) > alloc) \
- X      { \
- X        char *alloc_grown = realloc(pattern, alloc + 256 + (amount)); \
- X        if (!alloc_grown) \
- X          goto out_of_memory; \
- X        pattern = alloc_grown; \
- X        alloc += 256 + (amount); \
- X      } \
- X  MACRO_END
- X
- X#define STORE(ch) pattern[pattern_offset++] = (ch)
- X
- X#define CURRENT_LEVEL_START (starts[starts_base + current_level])
- X
- X#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
- X
- X#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
- X                            starts_base += NUM_LEVELS; \
- X                          else \
- X                            goto too_complex
- X
- X#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
- X
- X/* Store at OFFSET the two-byte (lsb,msb) displacement that leads to
- X   ADDR, counted from the byte following the displacement. */
- X#define PUT_ADDR(offset,addr) \
- X  MACRO_BEGIN \
- X    int disp = (addr) - (offset) - 2; \
- X    pattern[(offset)] = disp & 0xff; \
- X    pattern[(offset)+1] = (disp>>8) & 0xff; \
- X  MACRO_END
- X
- X/* Open a 3-byte gap at POS and put a jump of TYPE to ADDR there.  The
- X   caller must already have reserved the space with ALLOC. */
- X#define INSERT_JUMP(pos,type,addr) \
- X  MACRO_BEGIN \
- X    int a, p = (pos), t = (type), ad = (addr); \
- X    for (a = pattern_offset - 1; a >= p; a--) \
- X      pattern[a + 3] = pattern[a]; \
- X    pattern[p] = t; \
- X    PUT_ADDR(p+1,ad); \
- X    pattern_offset += 3; \
- X  MACRO_END
- X
- X#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
- X
- X#define SET_FIELDS \
- X  MACRO_BEGIN \
- X    bufp->allocated = alloc; \
- X    bufp->buffer = pattern; \
- X    bufp->used = pattern_offset; \
- X  MACRO_END
- X
- X#define GETHEX(var) \
- X  MACRO_BEGIN \
- X    char gethex_ch, gethex_value; \
- X    NEXTCHAR(gethex_ch); \
- X    gethex_value = hex_char_to_decimal(gethex_ch); \
- X    if (gethex_value == 16) \
- X      goto hex_error; \
- X    NEXTCHAR(gethex_ch); \
- X    gethex_ch = hex_char_to_decimal(gethex_ch); \
- X    if (gethex_ch == 16) \
- X      goto hex_error; \
- X    (var) = gethex_value * 16 + gethex_ch; \
- X  MACRO_END
- X
- X#define ANSI_TRANSLATE(ch) \
- X  MACRO_BEGIN \
- X    switch (ch) \
- X      { \
- X      case 'a': \
- X      case 'A': \
- X        ch = 7; /* audible bell */ \
- X        break; \
- X      case 'b': \
- X      case 'B': \
- X        ch = 8; /* backspace */ \
- X        break; \
- X      case 'f': \
- X      case 'F': \
- X        ch = 12; /* form feed */ \
- X        break; \
- X      case 'n': \
- X      case 'N': \
- X        ch = 10; /* line feed */ \
- X        break; \
- X      case 'r': \
- X      case 'R': \
- X        ch = 13; /* carriage return */ \
- X        break; \
- X      case 't': \
- X      case 'T': \
- X        ch = 9; /* tab */ \
- X        break; \
- X      case 'v': \
- X      case 'V': \
- X        ch = 11; /* vertical tab */ \
- X        break; \
- X      case 'x': /* hex code */ \
- X      case 'X': \
- X        GETHEX(ch); \
- X        break; \
- X      default: \
- X        /* other characters passed through */ \
- X        if (translate) \
- X          ch = translate[(unsigned char)ch]; \
- X        break; \
- X      } \
- X  MACRO_END
- X
- X  if (!re_compile_initialized)
- X    re_compile_initialize();
- X  bufp->used = 0;
- X  bufp->fastmap_accurate = 0;
- X  bufp->uses_registers = 0;
- X  translate = bufp->translate;
- X  pattern = bufp->buffer;
- X  alloc = bufp->allocated;
- X  /* Set before the first possible error exit: SET_FIELDS copies this
- X     value into bufp->used. */
- X  pattern_offset = 0;
- X  if (alloc == 0 || pattern == NULL)
- X    {
- X      pattern = malloc(256);
- X      if (!pattern)
- X        {
- X          alloc = 0; /* keep allocated consistent with a NULL buffer */
- X          goto out_of_memory;
- X        }
- X      alloc = 256;
- X    }
- X  starts_base = 0;
- X  num_jumps = 0;
- X  current_level = 0;
- X  SET_LEVEL_START;
- X  num_open_registers = 0;
- X  next_register = 1;
- X  paren_depth = 0;
- X  beginning_context = 1;
- X  op = -1;
- X  /* we use Rend dummy to ensure that pending jumps are updated (due to
- X     low priority of Rend) before exiting the loop. */
- X  pos = 0;
- X  while (op != Rend)
- X    {
- X      /* Classify the next character, resolving a backslash quote. */
- X      if (pos >= size)
- X        op = Rend;
- X      else
- X        {
- X          NEXTCHAR(ch);
- X          if (translate)
- X            ch = translate[(unsigned char)ch];
- X          op = regexp_plain_ops[(unsigned char)ch];
- X          if (op == Rquote)
- X            {
- X              NEXTCHAR(ch);
- X              op = regexp_quoted_ops[(unsigned char)ch];
- X              if (op == Rnormal && regexp_ansi_sequences)
- X                ANSI_TRANSLATE(ch);
- X            }
- X        }
- X      level = regexp_precedences[op];
- X      if (level > current_level)
- X        {
- X          /* Entering tighter levels: they all start here. */
- X          for (current_level++; current_level < level; current_level++)
- X            SET_LEVEL_START;
- X          SET_LEVEL_START;
- X        }
- X      else
- X        if (level < current_level)
- X          {
- X            /* Leaving tighter levels: resolve the alternation jumps
- X               recorded inside them so they land here. */
- X            current_level = level;
- X            for (;num_jumps > 0 &&
- X                 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
- X                 num_jumps--)
- X              PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
- X          }
- X      switch (op)
- X        {
- X        case Rend:
- X          break;
- X        case Rnormal:
- X        normal_char:
- X          opcode = Cexact;
- X        store_opcode_and_arg: /* opcode & ch must be set */
- X          SET_LEVEL_START;
- X          ALLOC(2);
- X          STORE(opcode);
- X          STORE(ch);
- X          break;
- X        case Ranychar:
- X          opcode = Canychar;
- X        store_opcode:
- X          SET_LEVEL_START;
- X          ALLOC(1);
- X          STORE(opcode);
- X          break;
- X        case Rquote:
- X          abort();
- X          /*NOTREACHED*/
- X        case Rbol:
- X          if (!beginning_context)
- X            {
- X              if (regexp_context_indep_ops)
- X                goto op_error;
- X              else
- X                goto normal_char;
- X            }
- X          opcode = Cbol;
- X          goto store_opcode;
- X        case Reol:
- X          /* $ is an anchor only at the end of the pattern or directly
- X             before an alternation bar or a closing parenthesis. */
- X          if (!((pos >= size) ||
- X                ((regexp_syntax & RE_NO_BK_VBAR) ?
- X                 (regex[pos] == '\174') :
- X                 (pos+1 < size && regex[pos] == '\134' &&
- X                  regex[pos+1] == '\174')) ||
- X                ((regexp_syntax & RE_NO_BK_PARENS)?
- X                 (regex[pos] == ')'):
- X                 (pos+1 < size && regex[pos] == '\134' &&
- X                  regex[pos+1] == ')'))))
- X            {
- X              if (regexp_context_indep_ops)
- X                goto op_error;
- X              else
- X                goto normal_char;
- X            }
- X          opcode = Ceol;
- X          goto store_opcode;
- X          break;
- X        case Roptional:
- X          if (beginning_context)
- X            {
- X              if (regexp_context_indep_ops)
- X                goto op_error;
- X              else
- X                goto normal_char;
- X            }
- X          if (CURRENT_LEVEL_START == pattern_offset)
- X            break; /* ignore empty patterns for ? */
- X          ALLOC(3);
- X          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
- X                      pattern_offset + 3);
- X          break;
- X        case Rstar:
- X        case Rplus:
- X          if (beginning_context)
- X            {
- X              if (regexp_context_indep_ops)
- X                goto op_error;
- X              else
- X                goto normal_char;
- X            }
- X          if (CURRENT_LEVEL_START == pattern_offset)
- X            break; /* ignore empty patterns for + and * */
- X          ALLOC(9);
- X          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
- X                      pattern_offset + 6);
- X          INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
- X          if (op == Rplus) /* jump over initial failure_jump */
- X            INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
- X                        CURRENT_LEVEL_START + 6);
- X          break;
- X        case Ror:
- X          ALLOC(6);
- X          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
- X                      pattern_offset + 6);
- X          if (num_jumps >= MAX_NESTING)
- X            goto too_complex;
- X          STORE(Cjump);
- X          /* The target is filled in when this level is left. */
- X          future_jumps[num_jumps++] = pattern_offset;
- X          STORE(0);
- X          STORE(0);
- X          SET_LEVEL_START;
- X          break;
- X        case Ropenpar:
- X          SET_LEVEL_START;
- X          if (next_register < RE_NREGS)
- X            {
- X              bufp->uses_registers = 1;
- X              ALLOC(2);
- X              STORE(Cstart_memory);
- X              STORE(next_register);
- X              open_registers[num_open_registers++] = next_register;
- X              next_register++;
- X            }
- X          paren_depth++;
- X          PUSH_LEVEL_STARTS;
- X          current_level = 0;
- X          SET_LEVEL_START;
- X          break;
- X        case Rclosepar:
- X          if (paren_depth <= 0)
- X            goto parenthesis_error;
- X          POP_LEVEL_STARTS;
- X          current_level = regexp_precedences[Ropenpar];
- X          paren_depth--;
- X          if (paren_depth < num_open_registers)
- X            {
- X              bufp->uses_registers = 1;
- X              ALLOC(2);
- X              STORE(Cend_memory);
- X              num_open_registers--;
- X              STORE(open_registers[num_open_registers]);
- X            }
- X          break;
- X        case Rmemory:
- X          if (ch == '0')
- X            goto bad_match_register;
- X          assert(ch >= '0' && ch <= '9');
- X          bufp->uses_registers = 1;
- X          opcode = Cmatch_memory;
- X          ch -= '0';
- X          goto store_opcode_and_arg;
- X        case Rextended_memory:
- X          NEXTCHAR(ch);
- X          if (ch < '0' || ch > '9')
- X            goto bad_match_register;
- X          NEXTCHAR(a);
- X          if (a < '0' || a > '9')
- X            goto bad_match_register;
- X          /* The first digit read is the tens digit, the second the
- X             units digit, so that \v12 names register 12. */
- X          ch = 10 * (ch - '0') + (a - '0');
- X          if (ch <= 0 || ch >= RE_NREGS)
- X            goto bad_match_register;
- X          bufp->uses_registers = 1;
- X          opcode = Cmatch_memory;
- X          goto store_opcode_and_arg;
- X        case Ropenset:
- X          {
- X            int complement,prev,offset,range,firstchar;
- X
- X            SET_LEVEL_START;
- X            ALLOC(1+256/8);
- X            STORE(Cset);
- X            offset = pattern_offset;
- X            for (a = 0; a < 256/8; a++)
- X              STORE(0);
- X            NEXTCHAR(ch);
- X            if (translate)
- X              ch = translate[(unsigned char)ch];
- X            if (ch == '\136')
- X              {
- X                complement = 1;
- X                NEXTCHAR(ch);
- X                if (translate)
- X                  ch = translate[(unsigned char)ch];
- X              }
- X            else
- X              complement = 0;
- X            prev = -1;
- X            range = 0;
- X            firstchar = 1;
- X            /* A ] in the first position is an ordinary set member. */
- X            while (ch != '\135' || firstchar)
- X              {
- X                firstchar = 0;
- X                if (regexp_ansi_sequences && ch == '\134')
- X                  {
- X                    NEXTCHAR(ch);
- X                    ANSI_TRANSLATE(ch);
- X                  }
- X                if (range)
- X                  {
- X                    for (a = prev; a <= ch; a++)
- X                      SETBIT(pattern, offset, a);
- X                    prev = -1;
- X                    range = 0;
- X                  }
- X                else
- X                  if (prev != -1 && ch == '-')
- X                    range = 1;
- X                  else
- X                    {
- X                      SETBIT(pattern, offset, ch);
- X                      prev = ch;
- X                    }
- X                NEXTCHAR(ch);
- X                if (translate)
- X                  ch = translate[(unsigned char)ch];
- X              }
- X            /* A trailing - that opened no range stands for itself. */
- X            if (range)
- X              SETBIT(pattern, offset, '-');
- X            if (complement)
- X              {
- X                for (a = 0; a < 256/8; a++)
- X                  pattern[offset+a] ^= 0xff;
- X              }
- X            break;
- X          }
- X        case Rbegbuf:
- X          opcode = Cbegbuf;
- X          goto store_opcode;
- X        case Rendbuf:
- X          opcode = Cendbuf;
- X          goto store_opcode;
- X        case Rwordchar:
- X          opcode = Csyntaxspec;
- X          ch = Sword;
- X          goto store_opcode_and_arg;
- X        case Rnotwordchar:
- X          opcode = Cnotsyntaxspec;
- X          ch = Sword;
- X          goto store_opcode_and_arg;
- X        case Rwordbeg:
- X          opcode = Cwordbeg;
- X          goto store_opcode;
- X        case Rwordend:
- X          opcode = Cwordend;
- X          goto store_opcode;
- X        case Rwordbound:
- X          opcode = Cwordbound;
- X          goto store_opcode;
- X        case Rnotwordbound:
- X          opcode = Cnotwordbound;
- X          goto store_opcode;
- X#ifdef emacs
- X        case Remacs_at_dot:
- X          opcode = Cemacs_at_dot;
- X          goto store_opcode;
- X        case Remacs_syntaxspec:
- X          NEXTCHAR(ch);
- X          if (translate)
- X            ch = translate[(unsigned char)ch];
- X          opcode = Csyntaxspec;
- X          ch = syntax_spec_code[(unsigned char)ch];
- X          goto store_opcode_and_arg;
- X        case Remacs_notsyntaxspec:
- X          NEXTCHAR(ch);
- X          if (translate)
- X            ch = translate[(unsigned char)ch];
- X          opcode = Cnotsyntaxspec;
- X          ch = syntax_spec_code[(unsigned char)ch];
- X          goto store_opcode_and_arg;
- X#endif /* emacs */
- X        default:
- X          abort();
- X        }
- X      beginning_context = (op == Ropenpar || op == Ror);
- X    }
- X  if (starts_base != 0)
- X    goto parenthesis_error;
- X  assert(num_jumps == 0);
- X  ALLOC(1);
- X  STORE(Cend);
- X  SET_FIELDS;
- X  return NULL;
- X
- X op_error:
- X  SET_FIELDS;
- X  return "Badly placed special character";
- X
- X bad_match_register:
- X  SET_FIELDS;
- X  return "Bad match register number";
- X
- X hex_error:
- X  SET_FIELDS;
- X  return "Bad hexadecimal number";
- X
- X parenthesis_error:
- X  SET_FIELDS;
- X  return "Badly placed parenthesis";
- X
- X out_of_memory:
- X  SET_FIELDS;
- X  return "Out of memory";
- X
- X ends_prematurely:
- X  SET_FIELDS;
- X  return "Regular expression ends prematurely";
- X
- X too_complex:
- X  SET_FIELDS;
- X  return "Regular expression too complex";
- X}
- X#undef CHARAT
- X#undef NEXTCHAR
- X#undef GETHEX
- X#undef ALLOC
- X#undef STORE
- X#undef CURRENT_LEVEL_START
- X#undef SET_LEVEL_START
- X#undef PUSH_LEVEL_STARTS
- X#undef POP_LEVEL_STARTS
- X#undef PUT_ADDR
- X#undef INSERT_JUMP
- X#undef SETBIT
- X#undef SET_FIELDS
- X
- Xstatic void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
- Xchar *code, *visited, *can_be_null, *fastmap;
- Xint pos;
- X{ /* Walk the compiled pattern CODE from offset POS and set fastmap[c] for
- X every character c that can begin a match along this path. VISITED holds
- X one flag per code offset and stops paths from being explored twice.
- X *can_be_null is set when the end of the pattern (or a register
- X back-reference) is reached before any character-consuming opcode. */
- X int a, b, syntaxcode;
- X
- X if (visited[pos])
- X return; /* we have already been here */
- X visited[pos] = 1;
- X for (;;) /* a break in the switch moves on to the next opcode; return ends this path */
- X switch (code[pos++])
- X {
- X case Cend:
- X *can_be_null = 1;
- X return;
- X case Cbol:
- X case Cbegbuf:
- X case Cendbuf:
- X case Cwordbeg:
- X case Cwordend:
- X case Cwordbound:
- X case Cnotwordbound:
- X#ifdef emacs
- X case Cemacs_at_dot:
- X#endif /* emacs */
- X break; /* no operand and nothing added to the fastmap: keep scanning */
- X case Csyntaxspec:
- X syntaxcode = code[pos++];
- X for (a = 0; a < 256; a++)
- X if (SYNTAX(a) == syntaxcode)
- X fastmap[a] = 1;
- X return;
- X case Cnotsyntaxspec:
- X syntaxcode = code[pos++];
- X for (a = 0; a < 256; a++)
- X if (SYNTAX(a) != syntaxcode)
- X fastmap[a] = 1;
- X return;
- X case Ceol:
- X fastmap['\n'] = 1;
- X if (*can_be_null == 0)
- X *can_be_null = 2; /* can match null, but only at end of buffer*/
- X return;
- X case Cset:
- X for (a = 0; a < 256/8; a++) /* one bitmap byte per 8 characters */
- X if (code[pos + a] != 0)
- X for (b = 0; b < 8; b++)
- X if (code[pos + a] & (1 << b))
- X fastmap[(a << 3) + b] = 1;
- X pos += 256/8;
- X return;
- X case Cexact:
- X fastmap[(unsigned char)code[pos]] = 1;
- X return;
- X case Canychar:
- X for (a = 0; a < 256; a++)
- X if (a != '\n')
- X fastmap[a] = 1;
- X return;
- X case Cstart_memory:
- X case Cend_memory:
- X pos++; /* skip the register number operand */
- X break;
- X case Cmatch_memory:
- X /* should this ever happen for sensible patterns??? */
- X *can_be_null = 1;
- X return;
- X case Cjump:
- X case Cdummy_failure_jump:
- X case Cupdate_failure_jump:
- X case Cstar_jump:
- X a = (unsigned char)code[pos++]; /* displacement, low byte */
- X a |= (unsigned char)code[pos++] << 8; /* displacement, high byte */
- X pos += (int)(short)a; /* signed 16-bit, counted from after the operand */
- X if (visited[pos])
- X {
- X /* argh... the regexp contains empty loops. This is not
- X good, as this may cause a failure stack overflow when
- X matching. Oh well. */
- X /* this path leads nowhere; pursue other paths. */
- X return;
- X }
- X visited[pos] = 1;
- X break;
- X case Cfailure_jump:
- X a = (unsigned char)code[pos++];
- X a |= (unsigned char)code[pos++] << 8;
- X a = pos + (int)(short)a; /* absolute offset of the failure target */
- X re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap); /* explore the failure branch first */
- X break; /* then continue with the code following the jump */
- X default:
- X abort(); /* probably some opcode is missing from this switch */
- X /*NOTREACHED*/
- X }
- X}
- X
- X/* Compute FASTMAP (256 entries) and *CAN_BE_NULL for the compiled
- X   pattern BUFFER of USED bytes, scanning from offset POS.  The
- X   per-offset "visited" flags live on the stack for small patterns and
- X   on the heap otherwise.  Returns 1 on success and 0 if the flag array
- X   could not be allocated.  */
- Xstatic int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
- Xchar *buffer, *fastmap, *can_be_null;
- Xint used, pos;
- X{
- X  char stack_marks[512];
- X  char *visited = stack_marks;
- X
- X  if (used > sizeof(stack_marks))
- X    {
- X      visited = malloc(used);
- X      if (visited == NULL)
- X        return 0;
- X    }
- X  *can_be_null = 0;
- X  memset(fastmap, 0, 256);
- X  memset(visited, 0, used);
- X  re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
- X  if (visited != stack_marks)
- X    free(visited);
- X  return 1;
- X}
- X
- X/* Compute the fastmap, can_be_null and anchor fields of BUFP from its
- X   compiled pattern.  Does nothing when the caller supplied no fastmap
- X   array or the fastmap is already accurate; leaves fastmap_accurate
- X   unset if the computation could not be carried out.  */
- Xvoid re_compile_fastmap(bufp)
- Xregexp_t bufp;
- X{
- X  if (bufp->fastmap == NULL || bufp->fastmap_accurate)
- X    return;
- X  assert(bufp->used > 0);
- X  if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
- X                             bufp->fastmap))
- X    return;
- X  /* The first opcode tells whether the pattern is anchored. */
- X  switch (bufp->buffer[0])
- X    {
- X    case Cbol:
- X      bufp->anchor = 1; /* begline */
- X      break;
- X    case Cbegbuf:
- X      bufp->anchor = 2; /* begbuf */
- X      break;
- X    default:
- X      bufp->anchor = 0; /* none */
- X      break;
- X    }
- X  bufp->fastmap_accurate = 1;
- X}
- X
- X#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
- X#define MAX_FAILURES 4100 /* max # of failure points before failing */
- X
- X/* Try to match the compiled pattern BUFP at exactly position POS of the
- X data formed by STRING1 (SIZE1 bytes) followed by STRING2 (SIZE2 bytes);
- X positions are counted over the concatenation of the two parts.
- X Characters at or beyond position MSTOP are never consumed. If REGS is
- X not NULL, the start and end positions of the whole match (register 0)
- X and of the other registers are stored there, -1 marking a register
- X whose end was never recorded.
- X Returns the length of the match, -1 if the pattern does not match at
- X POS, or -2 if the failure stack overflows or cannot be allocated.
- X Note that a star_jump opcode met during matching is rewritten in place
- X in the compiled code (to jump or update_failure_jump). */
- Xint re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
- Xregexp_t bufp;
- Xchar *string1, *string2;
- Xint size1, size2, pos, mstop;
- Xregexp_registers_t regs;
- X{
- X /* a failure point records where to resume (text/partend/code) when the
- X current path fails; text == NULL marks a dummy entry that is skipped */
- X struct failure_point { char *text, *partend, *code; }
- X *failure_stack_start, *failure_sp, *failure_stack_end,
- X initial_failure_stack[INITIAL_FAILURES];
- X char *code, *translate, *text, *textend, *partend, *part_2_end;
- X char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
- X char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
- X int a, b, ch, reg, regch, match_end;
- X char *regtext, *regpartend, *regtextend;
- X
- X /* PREFETCH: make sure a character is available at text, switching from
- X the first part to the second when needed; fails at the end of data */
- X#define PREFETCH \
- X MACRO_BEGIN \
- X if (text == partend) \
- X { \
- X if (text == textend) \
- X goto fail; \
- X text = string2; \
- X partend = part_2_end; \
- X } \
- X MACRO_END
- X
- X /* NEXTCHAR: fetch the next character into var, translated if a
- X translate table is in use */
- X#define NEXTCHAR(var) \
- X MACRO_BEGIN \
- X PREFETCH; \
- X (var) = (unsigned char)*text++; \
- X if (translate) \
- X (var) = (unsigned char)translate[(var)]; \
- X MACRO_END
- X
- X assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
- X assert(mstop <= size1 + size2);
- X assert(pos <= mstop);
- X
- X /* set up text (current position), partend (end of the part text is in),
- X textend (where matching must stop) and part_2_end (end of the usable
- X portion of the second part) */
- X if (pos <= size1)
- X {
- X text = string1 + pos;
- X if (mstop <= size1)
- X {
- X partend = string1 + mstop;
- X textend = partend;
- X }
- X else
- X {
- X partend = string1 + size1;
- X textend = string2 + mstop - size1;
- X }
- X part_2_end = string2 + mstop - size1;
- X }
- X else
- X {
- X text = string2 + pos - size1;
- X partend = string2 + mstop - size1;
- X textend = partend;
- X part_2_end = partend;
- X }
- X
- X /* Always clear the register end marks: Cmatch_memory tests them even
- X when the caller passed no REGS, so they must never be left
- X uninitialized. */
- X for (a = 0; a < RE_NREGS; a++)
- X regend_text[a] = NULL;
- X
- X code = bufp->buffer;
- X translate = bufp->translate;
- X failure_stack_start = failure_sp = initial_failure_stack;
- X failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
- X
- X#if 0
- X /* re_search_2 has already done this, and otherwise we get little benefit
- X from this. So I'll leave this out. */
- X if (bufp->fastmap_accurate && !bufp->can_be_null &&
- X text != textend &&
- X !bufp->fastmap[translate ?
- X (unsigned char)translate[(unsigned char)*text] :
- X (unsigned char)*text])
- X return -1; /* it can't possibly match */
- X#endif
- X
- X continue_matching:
- X for (;;)
- X {
- X switch (*code++)
- X {
- X case Cend:
- X /* success: convert the pointers back to positions */
- X if (partend != part_2_end)
- X match_end = text - string1;
- X else
- X match_end = text - string2 + size1;
- X if (regs)
- X {
- X regs->start[0] = pos;
- X regs->end[0] = match_end;
- X if (!bufp->uses_registers)
- X {
- X for (a = 1; a < RE_NREGS; a++)
- X {
- X regs->start[a] = -1;
- X regs->end[a] = -1;
- X }
- X }
- X else
- X {
- X for (a = 1; a < RE_NREGS; a++)
- X {
- X if (regend_text[a] == NULL)
- X {
- X regs->start[a] = -1;
- X regs->end[a] = -1;
- X continue;
- X }
- X if (regstart_partend[a] != part_2_end)
- X regs->start[a] = regstart_text[a] - string1;
- X else
- X regs->start[a] = regstart_text[a] - string2 + size1;
- X if (regend_partend[a] != part_2_end)
- X regs->end[a] = regend_text[a] - string1;
- X else
- X regs->end[a] = regend_text[a] - string2 + size1;
- X }
- X }
- X }
- X if (failure_stack_start != initial_failure_stack)
- X free((char *)failure_stack_start);
- X return match_end - pos;
- X case Cbol:
- X if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
- X break;
- X goto fail;
- X case Ceol:
- X if (text == string2 + size2 ||
- X (text == string1 + size1 ?
- X (size2 == 0 || *string2 == '\n') :
- X *text == '\n'))
- X break;
- X goto fail;
- X case Cset:
- X /* the opcode is followed by a 256-bit membership bitmap */
- X NEXTCHAR(ch);
- X if (code[ch/8] & (1<<(ch & 7)))
- X {
- X code += 256/8;
- X break;
- X }
- X goto fail;
- X case Cexact:
- X NEXTCHAR(ch);
- X if (ch != (unsigned char)*code++)
- X goto fail;
- X break;
- X case Canychar:
- X NEXTCHAR(ch);
- X if (ch == '\n')
- X goto fail;
- X break;
- X case Cstart_memory:
- X reg = *code++;
- X regstart_text[reg] = text;
- X regstart_partend[reg] = partend;
- X break;
- X case Cend_memory:
- X reg = *code++;
- X regend_text[reg] = text;
- X regend_partend[reg] = partend;
- X break;
- X case Cmatch_memory:
- X reg = *code++;
- X if (regend_text[reg] == NULL)
- X goto fail; /* or should we just match nothing? */
- X regtext = regstart_text[reg];
- X regtextend = regend_text[reg];
- X /* if the register spans both parts, its first piece ends where
- X string1 ends */
- X if (regstart_partend[reg] == regend_partend[reg])
- X regpartend = regtextend;
- X else
- X regpartend = string1 + size1;
- X
- X for (;regtext != regtextend;)
- X {
- X NEXTCHAR(ch);
- X if (regtext == regpartend)
- X regtext = string2;
- X regch = (unsigned char)*regtext++;
- X if (translate)
- X regch = (unsigned char)translate[regch];
- X if (regch != ch)
- X goto fail;
- X }
- X break;
- X case Cstar_jump:
- X /* star is coded as:
- X 1: failure_jump 2
- X ... code for operand of star
- X star_jump 1
- X 2: ... code after star
- X We change the star_jump to update_failure_jump if we can determine
- X that it is safe to do so; otherwise we change it to an ordinary
- X jump.
- X plus is coded as
- X jump 2
- X 1: failure_jump 3
- X 2: ... code for operand of plus
- X star_jump 1
- X 3: ... code after plus
- X For star_jump considerations this is processed identically
- X to star. */
- X a = (unsigned char)*code++;
- X a |= (unsigned char)*code++ << 8;
- X a = (int)(short)a;
- X {
- X char map[256], can_be_null;
- X char *p1, *p2;
- X
- X p1 = code + a + 3; /* skip the failure_jump */
- X assert(p1[-3] == Cfailure_jump);
- X p2 = code;
- X /* p1 points inside loop, p2 points to after loop */
- X if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
- X p2 - bufp->buffer, &can_be_null, map))
- X goto make_normal_jump;
- X /* If we might introduce a new update point inside the loop,
- X we can't optimize because then update_jump would update a
- X wrong failure point. Thus we have to be quite careful here. */
- X loop_p1:
- X /* loop until we find something that consumes a character */
- X switch (*p1++)
- X {
- X case Cbol:
- X case Ceol:
- X case Cbegbuf:
- X case Cendbuf:
- X case Cwordbeg:
- X case Cwordend:
- X case Cwordbound:
- X case Cnotwordbound:
- X#ifdef emacs
- X case Cemacs_at_dot:
- X#endif /* emacs */
- X goto loop_p1;
- X case Cstart_memory:
- X case Cend_memory:
- X p1++;
- X goto loop_p1;
- X case Cexact:
- X ch = (unsigned char)*p1++;
- X if (map[ch])
- X goto make_normal_jump;
- X break;
- X case Canychar:
- X for (b = 0; b < 256; b++)
- X if (b != '\n' && map[b])
- X goto make_normal_jump;
- X break;
- X case Cset:
- X for (b = 0; b < 256; b++)
- X if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
- X goto make_normal_jump;
- X p1 += 256/8;
- X break;
- X default:
- X goto make_normal_jump;
- X }
- X /* now we know that we can't backtrack. */
- X while (p1 != p2 - 3)
- X {
- X switch (*p1++)
- X {
- X case Cend:
- X abort(); /* we certainly shouldn't get this inside loop */
- X /*NOTREACHED*/
- X case Cbol:
- X case Ceol:
- X case Canychar:
- X case Cbegbuf:
- X case Cendbuf:
- X case Cwordbeg:
- X case Cwordend:
- X case Cwordbound:
- X case Cnotwordbound:
- X#ifdef emacs
- X case Cemacs_at_dot:
- X#endif /* emacs */
- X break;
- X case Cset:
- X p1 += 256/8;
- X break;
- X case Cexact:
- X case Cstart_memory:
- X case Cend_memory:
- X case Cmatch_memory:
- X case Csyntaxspec:
- X case Cnotsyntaxspec:
- X p1++;
- X break;
- X case Cjump:
- X case Cstar_jump:
- X case Cfailure_jump:
- X case Cupdate_failure_jump:
- X case Cdummy_failure_jump:
- X goto make_normal_jump;
- X default:
- X printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
- X break;
- X }
- X }
- X goto make_update_jump;
- X }
- X make_normal_jump:
- X /* printf("changing to normal jump\n"); */
- X code -= 3;
- X *code = Cjump;
- X break;
- X make_update_jump:
- X /* printf("changing to update jump\n"); */
- X code -= 2;
- X a += 3; /* jump to after the Cfailure_jump */
- X code[-1] = Cupdate_failure_jump;
- X code[0] = a & 0xff;
- X code[1] = a >> 8;
- X /* fall to next case */
- X case Cupdate_failure_jump:
- X failure_sp[-1].text = text;
- X failure_sp[-1].partend = partend;
- X /* fall to next case */
- X case Cjump:
- X a = (unsigned char)*code++;
- X a |= (unsigned char)*code++ << 8;
- X code += (int)(short)a;
- X break;
- X case Cdummy_failure_jump:
- X case Cfailure_jump:
- X if (failure_sp == failure_stack_end)
- X {
- X /* the stack is grown only once, from the on-stack array to a
- X heap block of MAX_FAILURES entries; overflowing that block
- X is reported as an error */
- X if (failure_stack_start != initial_failure_stack)
- X goto error;
- X failure_stack_start = (struct failure_point *)
- X malloc(MAX_FAILURES * sizeof(*failure_stack_start));
- X if (failure_stack_start == NULL)
- X {
- X /* out of memory: put the pointer back so that the error
- X exit does not try to free anything */
- X failure_stack_start = initial_failure_stack;
- X goto error;
- X }
- X failure_stack_end = failure_stack_start + MAX_FAILURES;
- X memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
- X INITIAL_FAILURES * sizeof(*failure_stack_start));
- X failure_sp = failure_stack_start + INITIAL_FAILURES;
- X }
- X a = (unsigned char)*code++;
- X a |= (unsigned char)*code++ << 8;
- X a = (int)(short)a;
- X if (code[-3] == Cdummy_failure_jump)
- X { /* this is only used in plus */
- X assert(*code == Cfailure_jump);
- X b = (unsigned char)code[1];
- X b |= (unsigned char)code[2] << 8;
- X failure_sp->code = code + (int)(short)b + 3;
- X failure_sp->text = NULL;
- X code += a;
- X }
- X else
- X {
- X failure_sp->code = code + a;
- X failure_sp->text = text;
- X failure_sp->partend = partend;
- X }
- X failure_sp++;
- X break;
- X case Cbegbuf:
- X if (text == string1)
- X break;
- X goto fail;
- X case Cendbuf:
- X if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
- X break;
- X goto fail;
- X case Cwordbeg:
- X if (text == string2 + size2)
- X goto fail;
- X if (size2 == 0 && text == string1 + size1)
- X goto fail;
- X /* at the junction of the two parts the next character is the
- X first character of string2 (size2 != 0 is known here) */
- X if (SYNTAX(text == string1 + size1 ? *string2 : *text) != Sword)
- X goto fail;
- X if (text == string1)
- X break;
- X if (SYNTAX(text[-1]) != Sword)
- X break;
- X goto fail;
- X case Cwordend:
- X if (text == string1)
- X goto fail;
- X if (SYNTAX(text[-1]) != Sword)
- X goto fail;
- X if (text == string2 + size2)
- X break;
- X if (size2 == 0 && text == string1 + size1)
- X break;
- X /* as in Cwordbeg: look at string2 when standing at the junction */
- X if (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)
- X goto fail;
- X break;
- X case Cwordbound:
- X /* Note: as in gnu regexp, this also matches at the beginning
- X and end of buffer. */
- X if (text == string1 || text == string2 + size2 ||
- X (size2 == 0 && text == string1 + size1))
- X break;
- X if ((SYNTAX(text[-1]) == Sword) ^
- X (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
- X break;
- X goto fail;
- X case Cnotwordbound:
- X /* Note: as in gnu regexp, this never matches at the beginning
- X and end of buffer. */
- X if (text == string1 || text == string2 + size2 ||
- X (size2 == 0 && text == string1 + size1))
- X goto fail;
- X if (!((SYNTAX(text[-1]) == Sword) ^
- X (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
- X goto fail;
- X break;
- X case Csyntaxspec:
- X NEXTCHAR(ch);
- X if (SYNTAX(ch) != (unsigned char)*code++)
- X goto fail;
- X break;
- X case Cnotsyntaxspec:
- X NEXTCHAR(ch);
- X if (SYNTAX(ch) != (unsigned char)*code++)
- X break;
- X goto fail;
- X#ifdef emacs
- X case Cemacs_at_dot:
- X if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
- X goto fail;
- X break;
- X#endif /* emacs */
- X default:
- X abort();
- X /*NOTREACHED*/
- X }
- X }
- X abort();
- X /*NOTREACHED*/
- X
- X fail:
- X /* backtrack to the most recent real failure point, if any is left */
- X if (failure_sp != failure_stack_start)
- X {
- X failure_sp--;
- X text = failure_sp->text;
- X if (text == NULL)
- X goto fail;
- X partend = failure_sp->partend;
- X code = failure_sp->code;
- X goto continue_matching;
- X }
- X if (failure_stack_start != initial_failure_stack)
- X free((char *)failure_stack_start);
- X return -1;
- X
- X error:
- X if (failure_stack_start != initial_failure_stack)
- X free((char *)failure_stack_start);
- X return -2;
- X}
- X
- X#undef PREFETCH
- X#undef NEXTCHAR
- X#undef PUSH_FAILURE
- X
- X/* Match BUFP against the single buffer STRING (SIZE bytes) at offset POS.
- X   This is a thin front end to re_match_2 with an empty second part and
- X   the match limit set to SIZE; the result of re_match_2 is returned
- X   unchanged. */
- Xint re_match(bufp, string, size, pos, regs)
- Xregexp_t bufp;
- Xchar *string;
- Xint size, pos;
- Xregexp_registers_t regs;
- X{
- X  char *no_second_part = (char *)NULL;
- X  int outcome;
- X
- X  outcome = re_match_2(bufp, string, size, no_second_part, 0, pos, regs,
- X                       size);
- X  return outcome;
- X}
- X
- X/* Search the data formed by STRING1 (SIZE1 bytes) followed by STRING2
- X (SIZE2 bytes) for a match of BUFP. Candidate start positions run from
- X POS to POS + RANGE; a negative RANGE searches backwards. Each candidate
- X is tried with re_match_2, passing REGS and MSTOP through unchanged.
- X Returns the position of the first candidate that matches, -1 if none
- X matches, or -2 if re_match_2 reported an error.
- X The fastmap (if BUFP has one) is only used to skip candidates that
- X cannot start a match; it is compiled here when not yet accurate. */
- Xint re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
- X mstop)
- Xregexp_t bufp;
- Xchar *string1, *string2;
- Xint size1, size2, pos, range, mstop;
- Xregexp_registers_t regs;
- X{
- X char *fastmap, *translate, *text, *partstart, *partend;
- X int dir, ret;
- X char anchor;
- X
- X assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
- X assert(pos + range + 1 >= 0 && pos + range - 1 <= size1 + size2);
- X assert(pos <= mstop);
- X
- X fastmap = bufp->fastmap;
- X translate = bufp->translate;
- X if (fastmap && !bufp->fastmap_accurate)
- X re_compile_fastmap(bufp);
- X anchor = bufp->anchor;
- X if (fastmap && !bufp->fastmap_accurate)
- X {
- X /* re_compile_fastmap could not build the fastmap, so the fastmap
- X contents and the anchor kind it would have set cannot be
- X trusted. Fall back to trying every position; re_match_2 itself
- X still rejects positions an anchored pattern cannot match at. */
- X fastmap = NULL;
- X anchor = 0;
- X }
- X if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
- X fastmap = NULL;
- X if (range < 0)
- X {
- X dir = -1;
- X range = -range;
- X }
- X else
- X dir = 1;
- X if (anchor == 2)
- X {
- X /* anchored to beginning of buffer: only position 0 can match */
- X if (pos != 0)
- X return -1;
- X else
- X range = 0;
- X }
- X for (; range >= 0; range--, pos += dir)
- X {
- X if (fastmap)
- X {
- X if (dir == 1)
- X { /* searching forwards */
- X if (pos < size1)
- X {
- X text = string1 + pos;
- X if (pos + range > size1)
- X partend = string1 + size1;
- X else
- X partend = string1 + pos + range;
- X }
- X else
- X {
- X text = string2 + pos - size1;
- X partend = string2 + pos + range - size1;
- X }
- X partstart = text;
- X if (translate)
- X while (text != partend &&
- X !fastmap[(unsigned char)
- X translate[(unsigned char)*text]])
- X text++;
- X else
- X while (text != partend && !fastmap[(unsigned char)*text])
- X text++;
- X pos += text - partstart;
- X range -= text - partstart;
- X if (pos == size1 + size2 && bufp->can_be_null == 0)
- X return -1;
- X }
- X else
- X { /* searching backwards */
- X /* At the very end of the data there is no character to look
- X up in the fastmap; just try the match there instead of
- X reading past the end. */
- X if (pos < size1 + size2)
- X {
- X /* position size1 is the first character of string2, not
- X a character of string1 */
- X if (pos < size1)
- X {
- X text = string1 + pos;
- X partstart = string1 + pos - range;
- X }
- X else
- X {
- X text = string2 + pos - size1;
- X if (range < pos - size1)
- X partstart = string2 + pos - size1 - range;
- X else
- X partstart = string2;
- X }
- X partend = text;
- X if (translate)
- X while (text != partstart &&
- X !fastmap[(unsigned char)
- X translate[(unsigned char)*text]])
- X text--;
- X else
- X while (text != partstart &&
- X !fastmap[(unsigned char)*text])
- X text--;
- X pos -= partend - text;
- X range -= partend - text;
- X }
- X }
- X }
- X if (anchor == 1)
- X { /* anchored to begline */
- X if (pos > 0 &&
- X (pos <= size1 ? string1[pos - 1] :
- X string2[pos - size1 - 1]) != '\n')
- X continue;
- X }
- X assert(pos >= 0 && pos <= size1 + size2);
- X ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
- X if (ret >= 0)
- X return pos;
- X if (ret == -2)
- X return -2;
- X }
- X return -1;
- X}
- X
- X/* Search the single buffer STRING (SIZE bytes) for a match of BUFP, with
- X   candidate start positions running from STARTPOS over RANGE.  This simply
- X   calls re_search_2 with an empty second part and the match limit set to
- X   SIZE, and returns its result unchanged. */
- Xint re_search(bufp, string, size, startpos, range, regs)
- Xregexp_t bufp;
- Xchar *string;
- Xint size, startpos, range;
- Xregexp_registers_t regs;
- X{
- X  char *no_second_part = (char *)NULL;
- X  int found;
- X
- X  found = re_search_2(bufp, string, size, no_second_part, 0,
- X                      startpos, range, regs, size);
- X  return found;
- X}
- X
- Xstatic struct re_pattern_buffer re_comp_buf;
- X
- X/* BSD-style entry point: compile the NUL-terminated pattern S into the
- X file-private buffer re_comp_buf, for later use by re_exec.
- X Returns NULL on success, otherwise the error string returned by
- X re_compile_pattern. When S is NULL nothing is compiled: the result is
- X NULL if re_comp_buf already has a code buffer and an error string
- X otherwise. */
- Xchar *re_comp(s)
- Xchar *s;
- X{
- X if (s == NULL)
- X {
- X if (!re_comp_buf.buffer)
- X return "Out of memory";
- X return NULL;
- X }
- X if (!re_comp_buf.buffer)
- X {
- X /* the buffer will be allocated automatically */
- X /* Allocate the fastmap only once: this branch is taken again on
- X every call for as long as no code buffer exists, and a fresh
- X malloc each time would leak the previous fastmap. A NULL
- X fastmap is acceptable; the search then runs without one. */
- X if (!re_comp_buf.fastmap)
- X re_comp_buf.fastmap = malloc(256);
- X re_comp_buf.translate = NULL;
- X }
- X return re_compile_pattern(s, strlen(s), &re_comp_buf);
- X}
- X
- X/* BSD-style entry point: search the NUL-terminated string S for the
- X pattern last compiled into re_comp_buf by re_comp.
- X Returns 1 if a match is found anywhere in S and 0 otherwise (including
- X when the search reports an error or no pattern has been compiled). */
- Xint re_exec(s)
- Xchar *s;
- X{
- X int len;
- X
- X /* without compiled code the matcher would dereference a NULL buffer */
- X if (!re_comp_buf.buffer)
- X return 0;
- X len = strlen(s);
- X return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
- X}
- X
- X#ifdef TEST_REGEXP
- X
- X/* Interactive test driver (built only when TEST_REGEXP is defined).
- X Repeatedly reads a regexp from standard input, compiles it, dumps the
- X compiled code, the flags and the fastmap, and then runs re_search on
- X every following input line until an empty line is entered. Terminates
- X at end of input. */
- Xint main()
- X{
- X char buf[500];
- X char *cp;
- X struct re_pattern_buffer exp;
- X struct re_registers regs;
- X int a,pos;
- X char fastmap[256];
- X
- X exp.allocated = 0;
- X exp.buffer = 0;
- X exp.translate = NULL;
- X exp.fastmap = fastmap;
- X
- X /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
- X
- X while (1)
- X {
- X printf("Enter regexp:\n");
- X /* bounded read; stop at end of input instead of looping forever */
- X if (fgets(buf, sizeof(buf), stdin) == NULL)
- X break;
- X a = strlen(buf);
- X if (a > 0 && buf[a - 1] == '\n')
- X buf[a - 1] = '\0'; /* the newline is not part of the regexp */
- X cp=re_compile_pattern(buf, strlen(buf), &exp);
- X if (cp)
- X {
- X printf("Error: %s\n", cp);
- X continue;
- X }
- X re_compile_fastmap(&exp);
- X printf("dump:\n");
- X /* disassemble the compiled code, one opcode per output line; buf is
- X reused as the formatting buffer */
- X for (pos = 0; pos < exp.used;)
- X {
- X printf("%d: ", pos);
- X switch (exp.buffer[pos++])
- X {
- X case Cend:
- X strcpy(buf, "end");
- X break;
- X case Cbol:
- X strcpy(buf, "bol");
- X break;
- X case Ceol:
- X strcpy(buf, "eol");
- X break;
- X case Cset:
- X strcpy(buf, "set ");
- X for (a = 0; a < 256/8; a++)
- X sprintf(buf+strlen(buf)," %02x",
- X (unsigned char)exp.buffer[pos++]);
- X break;
- X case Cexact:
- X sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
- X (unsigned char)exp.buffer[pos]);
- X pos++;
- X break;
- X case Canychar:
- X strcpy(buf, "anychar");
- X break;
- X case Cstart_memory:
- X sprintf(buf, "start_memory %d", exp.buffer[pos++]);
- X break;
- X case Cend_memory:
- X sprintf(buf, "end_memory %d", exp.buffer[pos++]);
- X break;
- X case Cmatch_memory:
- X sprintf(buf, "match_memory %d", exp.buffer[pos++]);
- X break;
- X case Cjump:
- X case Cdummy_failure_jump:
- X case Cstar_jump:
- X case Cfailure_jump:
- X case Cupdate_failure_jump:
- X /* 16-bit little-endian signed displacement, shown as the
- X absolute target offset */
- X a = (unsigned char)exp.buffer[pos++];
- X a += (unsigned char)exp.buffer[pos++] << 8;
- X a = (int)(short)a;
- X switch (exp.buffer[pos-3])
- X {
- X case Cjump:
- X cp = "jump";
- X break;
- X case Cstar_jump:
- X cp = "star_jump";
- X break;
- X case Cfailure_jump:
- X cp = "failure_jump";
- X break;
- X case Cupdate_failure_jump:
- X cp = "update_failure_jump";
- X break;
- X case Cdummy_failure_jump:
- X cp = "dummy_failure_jump";
- X break;
- X default:
- X cp = "unknown jump";
- X break;
- X }
- X sprintf(buf, "%s %d", cp, a + pos);
- X break;
- X case Cbegbuf:
- X strcpy(buf,"begbuf");
- X break;
- X case Cendbuf:
- X strcpy(buf,"endbuf");
- X break;
- X case Cwordbeg:
- X strcpy(buf,"wordbeg");
- X break;
- X case Cwordend:
- X strcpy(buf,"wordend");
- X break;
- X case Cwordbound:
- X strcpy(buf,"wordbound");
- X break;
- X case Cnotwordbound:
- X strcpy(buf,"notwordbound");
- X break;
- X /* these two carry a one-byte operand that must be consumed, or
- X the rest of the dump gets out of step */
- X case Csyntaxspec:
- X sprintf(buf, "syntaxspec %d", exp.buffer[pos++]);
- X break;
- X case Cnotsyntaxspec:
- X sprintf(buf, "notsyntaxspec %d", exp.buffer[pos++]);
- X break;
- X default:
- X sprintf(buf, "unknown code %d",
- X (unsigned char)exp.buffer[pos - 1]);
- X break;
- X }
- X printf("%s\n", buf);
- X }
- X printf("can_be_null = %d uses_registers = %d anchor = %d\n",
- X exp.can_be_null, exp.uses_registers, exp.anchor);
- X
- X printf("fastmap:");
- X for (a = 0; a < 256; a++)
- X if (exp.fastmap[a])
- X printf(" %d", a);
- X printf("\n");
- X printf("Enter strings. An empty line terminates.\n");
- X while (fgets(buf, sizeof(buf), stdin))
- X {
- X if (buf[0] == '\n')
- X break;
- X a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
- X printf("search returns %d\n", a);
- X if (a != -1)
- X {
- X for (a = 0; a < RE_NREGS; a++)
- X {
- X printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
- X }
- X }
- X }
- X }
- X return 0;
- X}
- X
- X#endif /* TEST_REGEXP */
- END_OF_regexpr.c
- if test 41626 -ne `wc -c <regexpr.c`; then
- echo shar: \"regexpr.c\" unpacked with wrong size!
- fi
- # end of overwriting check
- fi
- echo shar: End of shell archive.
- exit 0
-
- exit 0 # Just in case...
- --
- Kent Landfield INTERNET: kent@sparky.IMD.Sterling.COM
- Sterling Software, IMD UUCP: uunet!sparky!kent
- Phone: (402) 291-8300 FAX: (402) 291-4362
- Please send comp.sources.misc-related mail to kent@uunet.uu.net.
-